The imputation of missing values in clinical variables will be done using the MICE (Multiple Imputation by Chained Equations) method.

The R package used to implement it is called ‘mice’.

https://www.rdocumentation.org/packages/mice/versions/3.16.0/topics/mice

# Deactivate scientific notation
# (a large `scipen` penalty makes R prefer fixed over exponential notation when printing)
options(scipen = 999)
# Load libraries
# `install = FALSE` and `update = FALSE` are passed so that p_load() is asked
# neither to install nor to update any of the listed packages
pacman::p_load(
  an9elproject,
  tidyverse,
  lubridate,
  magrittr,
  naniar,
  mice,
  mctest,
  plotly,
  install = FALSE, update = FALSE
  )

Load data

# Retrieve the cohort project object, pinned to version 0.0.8003
oncoth1 <- get_project("oncothr1", version = "0.0.8003")
# Keep its `data` element, which is what the rest of the analysis works on
oncoth1_data <- oncoth1$data

Inspecting missing values throughout the data set

# Count the variables that contain at least one missing value
oncoth1_data %>% n_var_miss()
## [1] 342

Let’s see which of the clinical variables have missing values and which should be imputed to gain statistical power.

# Visualize % of missing values by variable
# All exclusions are collected in a single negated selection; the remaining
# columns keep their original order.
oncoth1_data %>%
  select(!c(
    # Exclude these variables
    n_appointment_patient_became_case,
    patient_left_study,
    anticoag_tx_lmwh_dosage,
    anticoag_tx_vte_drugs,
    vte_before_entering_study,
    # Exclude free text variables
    contains("reason"),
    # Exclude date variables
    contains("date"),
    # Exclude variables relative to VTE
    starts_with("type_"), # VTE type
    starts_with("cancer_concomitant_"), # VTE concomitant to cancer diagnosis
    starts_with("eval_"), # VTE diagnosed during study
    contains("_study"),
    contains("vte_dx_during_cancer_follow_up"),
    starts_with("rec_"), # VTE recurrences
    ends_with("_type_recurrence"),
    # Exclude variables with time until event
    starts_with("tu_"),
    # Exclude genetic variables
    starts_with("rs"),
    # Exclude variables related to ONCOTHROMB score
    starts_with("ONCOTHROMB"),
    "GRS"
  )) %>%
  # Keep only the variables that have at least one missing value
  # (select(where()) replaces the superseded select_if())
  select(where(~ any(is.na(.x)))) %>%
  # Plot % of missing values
  gg_miss_var(show_pct = TRUE)

Exploration of missing values distribution

# Per-variable summary of the dataset, including the percentage of missing values
naniar_summary <- oncoth1_data %>% miss_var_summary()

# Keep only the variables that actually have missing values
naniar_summary_missing_values <- filter(naniar_summary, pct_miss != 0)
# Missing-data pattern table and plot for the clinical variables of interest
# Useful for investigating any structure of missing observations in the data
# (creatinine used to be listed twice; select() keeps a single copy of a
# repeated column, so it is listed once here)
oncoth1_data %>%
  select(
    albumin, aptt, aptt_ratio, bilirubin, creatinine, alkaline_phosphatase,
    tobacco_use, copd, venous_insufficiency, catheter_device,
    oral_contraceptive_tx
  ) %>%
  md.pattern(plot = TRUE, rotate.names = TRUE)

##     creatinine copd venous_insufficiency catheter_device oral_contraceptive_tx
## 185          1    1                    1               1                     1
## 34           1    1                    1               1                     1
## 9            1    1                    1               1                     1
## 36           1    1                    1               1                     1
## 9            1    1                    1               1                     1
## 15           1    1                    1               1                     1
## 5            1    1                    1               1                     1
## 49           1    1                    1               1                     1
## 2            1    1                    1               1                     1
## 5            1    1                    1               1                     1
## 4            1    1                    1               1                     1
## 3            1    1                    1               1                     1
## 13           1    1                    1               1                     1
## 1            1    1                    1               1                     1
## 1            1    1                    1               1                     1
## 4            1    1                    1               1                     1
## 1            1    1                    1               1                     1
## 5            1    1                    1               1                     1
## 2            1    1                    1               1                     1
## 1            1    1                    1               1                     1
## 1            1    1                    1               1                     1
## 1            1    1                    1               1                     0
## 1            1    1                    1               0                     1
## 1            1    1                    0               1                     1
## 1            1    0                    1               1                     1
## 1            0    1                    1               1                     1
##              1    1                    1               1                     1
##     tobacco_use bilirubin alkaline_phosphatase albumin aptt aptt_ratio    
## 185           1         1                    1       1    1          1   0
## 34            1         1                    1       1    1          0   1
## 9             1         1                    1       1    0          1   1
## 36            1         1                    1       1    0          0   2
## 9             1         1                    1       0    1          1   1
## 15            1         1                    1       0    1          0   2
## 5             1         1                    1       0    0          1   2
## 49            1         1                    1       0    0          0   3
## 2             1         1                    0       1    1          0   2
## 5             1         1                    0       1    0          0   3
## 4             1         1                    0       0    1          0   3
## 3             1         1                    0       0    0          1   3
## 13            1         1                    0       0    0          0   4
## 1             1         0                    1       0    0          0   4
## 1             1         0                    0       1    0          0   4
## 4             1         0                    0       0    1          0   4
## 1             1         0                    0       0    0          1   4
## 5             1         0                    0       0    0          0   5
## 2             0         1                    1       1    1          1   1
## 1             0         1                    1       1    0          0   3
## 1             0         0                    0       0    0          0   6
## 1             1         1                    1       0    1          0   3
## 1             1         1                    0       1    0          0   4
## 1             1         1                    1       0    0          0   4
## 1             1         1                    1       1    0          0   3
## 1             1         1                    1       1    0          0   3
##               4        13                   40     112  134        176 484

All of these variables have values that are missing not at random (MNAR), except aptt and aptt_ratio.

Variable subset for imputation

Virtually every dataset contains some parts that could better be removed before imputation. This includes, but is not limited to, uninteresting variables with a high proportion of missing data, variables without a code for the missing data, administrative variables, constant variables, duplicated, recoded or standardized variables, and aggregates and indices of other information.

# Subset of variables that enters the imputation
# The column order below is the order the columns have in data_for_imputation
data_for_imputation <- select(
  oncoth1_data,
  id, patient_group, patient_status_at_end_study,
  age_when_cancer_dx, gender,
  menopausal_status, pregnancy, oral_contraceptive_tx,
  weight, height,
  body_surface_area, # collinear with bmi and somewhat with creatinine
  bmi_value, bmi_category,
  performance_status_category_corrected,
  albumin, aptt, aptt_ratio, bilirubin, creatinine, alkaline_phosphatase,
  hemoglobin, inr, leukocytes, platelets,
  leukocytosis, thrombocytosis, low_hemoglobin,
  tobacco_use, copd, venous_insufficiency,
  primary_tumor_simplified, progression_according_to_clinical_stage,
  tnm_stage, t_stage, n_stage,
  histology_type, mucinous_histology, grade_histological_differentiation,
  metastasis_dx, n_metastases,
  catheter_device, khorana_risk_score, tic_onco
)
# Simplify the patient status
data_for_imputation <- data_for_imputation %>%
  mutate(
    # Keep only the first word of the status (we only want to know whether the
    # patient is alive, dead or unknown) and store the result as a factor
    patient_status_at_end_study = as.factor(
      word(patient_status_at_end_study, 1)
    )
  )
# Look for collinear data
# Correlation matrix of the numeric variables, computed on pairwise-complete
# observations
numeric_collinearity <- data_for_imputation %>%
  select(where(is.numeric)) %>%
  cor(use = "pairwise.complete.obs")

# Show heatmap and dendrogram
heatmap(numeric_collinearity)

# Diagonal correlation plot for numeric variables
data_for_imputation %>%
  # Get only numeric variables
  select(where(is.numeric)) %>%
  # Calculate correlation on pairwise-complete observations
  cor(use = "pairwise.complete.obs") %>%
  # Draw the lower triangle only, without the diagonal, with labelled cells
  ggcorrplot::ggcorrplot(
    type = "lower",
    show.diag = FALSE,
    lab = TRUE,
    lab_size = 2,
    tl.cex = 10
  )

# Correlation plot between numeric and categorical variables
data_for_imputation %>%
  # Remove ID and redundant variables
  select(-c(
    id,
    patient_group,
    patient_status_at_end_study,
    bmi_category,
    oral_contraceptive_tx,
    pregnancy,
    low_hemoglobin,
    leukocytosis,
    thrombocytosis,
    t_stage,
    n_stage,
    metastasis_dx
  )) %>%
  # Transform some factors into numeric
  # NOTE(review): as.numeric() on a factor returns the integer level codes, not
  # the numbers written in the labels -- confirm the level order of these
  # variables is the intended numeric scale
  mutate(across(
    c(performance_status_category_corrected, khorana_risk_score, n_metastases),
    ~ as.numeric(.x)
  )) %>%
  # Transform histological grade into a 1-3 score (any other value becomes NA)
  mutate(grade_histological_differentiation = case_when(
    grade_histological_differentiation == "Well differentiated" ~ 1,
    grade_histological_differentiation == "Moderately differentiated" ~ 2,
    grade_histological_differentiation == "Poorly differentiated" ~ 3
  )) %>%
  # Dummy-code the factors (no intercept).
  # The model frame is built with na.action = na.pass so that rows containing
  # missing values are kept: with the default na.action, model.matrix() drops
  # every incomplete row and the pairwise-complete correlation below would
  # silently be computed on complete cases only.
  {
    model.matrix(
      ~ 0 + .,
      data = model.frame(~ ., data = ., na.action = na.pass)
    )
  } %>%
  cor(use = "pairwise.complete.obs") %>%
  as.data.frame() %>% # Convert matrix to data frame
  # Blank out the lower triangle so that each pair is shown only once
  # (the upper triangle and the diagonal are kept)
  (function(corr) {
    corr[lower.tri(corr, diag = FALSE)] <- NA
    corr
  }) %>%
  # Create an interactive heatmap with Plotly
  plot_ly(
    x = colnames(.),
    y = rownames(.),
    z = as.matrix(.),
    type = "heatmap",
    colors = colorRamp(c("blue", "white", "red")), # Adjust color scale as needed
    colorbar = list(title = "Correlation")
  )

Visualize influx and outflux in data set

The influx of a variable quantifies how well its missing data connects to the observed data on other variables. The outflux of a variable quantifies how well its observed data connect to the missing data on other variables. Variables with higher outflux are (potentially) the more powerful predictors. Variables with higher influx depend strongly on the imputation model.

# Influx/outflux statistics of every variable selected for imputation
dataset_flux <- flux(data_for_imputation)
# Influx-outflux plot of the same data
fluxplot(
  data_for_imputation,
  ylim = c(0, 1.05),
  cex = 0.7,
  eqscplot = TRUE
)

The group in the upper-left corner has (almost) complete information, so the number of missing data problems for this group is relatively small. The intermediate group has an outflux between 0.5 and 0.8, which is small. Missing data problems are more severe for this group, but it could potentially contain important variables. The third group has an outflux of 0.5 or lower, so its predictive power is limited.

Variables that might cause problems later on in the imputations are located in the lower-right corner.

Most points are relatively close to the diagonal, which indicates that influx and outflux are balanced.

Imputing missing values with MICE

Useful links:

https://cran.r-project.org/web/packages/finalfit/vignettes/missing.html

https://datascienceplus.com/handling-missing-data-with-mice-package-a-simple-approach/

# Dry run of mice (maxit = 0): creates a mids object that holds the default
# settings
init <- mice(data_for_imputation, maxit = 0, seed = 2828)
## Warning: Number of logged events: 3
# Default imputation method for each variable
# Detects oral_contraceptive_tx as collinear and eliminates it from imputation
meth <- init$method
# Default predictor matrix (replaced further below by the quickpred() result)
predM <- init$predictorMatrix

As a general rule, using every bit of available information yields multiple imputations that have minimal bias and maximal efficiency.

It is often beneficial to choose as large a number of predictors as possible. Including as many predictors as possible tends to make the missing at random (MAR) assumption more plausible, thus reducing the need to make special adjustments for MNAR mechanisms.

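In datasets with many variables, however, including all of them as predictors is often not feasible because of multicollinearity and computational problems. For imputation purposes, it is therefore expedient to select a suitable subset of data that contains no more than 15 to 25 variables.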

# Show the imputation method that will be used for each variable in the dataset
# Variables with no missing data are assigned no method ("")
# Yes/no variables are imputed using 'logreg' (logistic regression)
# Categorical variables with more than two levels are imputed with 'polyreg' (polytomous logistic regression)
# Numerical variables are imputed using 'pmm' (predictive mean matching)
print(meth)
##                                      id                           patient_group 
##                                      ""                                      "" 
##             patient_status_at_end_study                      age_when_cancer_dx 
##                                      ""                                      "" 
##                                  gender                       menopausal_status 
##                                      ""                               "polyreg" 
##                               pregnancy                   oral_contraceptive_tx 
##                                "logreg"                                      "" 
##                                  weight                                  height 
##                                      ""                                      "" 
##                       body_surface_area                               bmi_value 
##                                      ""                                      "" 
##                            bmi_category   performance_status_category_corrected 
##                                      ""                               "polyreg" 
##                                 albumin                                    aptt 
##                                   "pmm"                                   "pmm" 
##                              aptt_ratio                               bilirubin 
##                                   "pmm"                                   "pmm" 
##                              creatinine                    alkaline_phosphatase 
##                                   "pmm"                                   "pmm" 
##                              hemoglobin                                     inr 
##                                      ""                                   "pmm" 
##                              leukocytes                               platelets 
##                                   "pmm"                                      "" 
##                            leukocytosis                          thrombocytosis 
##                                "logreg"                                      "" 
##                          low_hemoglobin                             tobacco_use 
##                                      ""                               "polyreg" 
##                                    copd                    venous_insufficiency 
##                                "logreg"                                "logreg" 
##                primary_tumor_simplified progression_according_to_clinical_stage 
##                                      ""                               "polyreg" 
##                               tnm_stage                                 t_stage 
##                                      ""                               "polyreg" 
##                                 n_stage                          histology_type 
##                               "polyreg"                               "polyreg" 
##                      mucinous_histology      grade_histological_differentiation 
##                                "logreg"                               "polyreg" 
##                           metastasis_dx                            n_metastases 
##                                      ""                                      "" 
##                         catheter_device                      khorana_risk_score 
##                                "logreg"                               "polyreg" 
##                                tic_onco 
##                                "logreg"
# Build the predictor matrix with quickpred(), which selects the predictors of
# each variable to be imputed according to simple statistics
# The result is a square 0/1 matrix: a 1 means that the variable in that column
# will be used as a predictor for the variable in that row
predM <- quickpred(
  data_for_imputation,
  include = c("patient_group", "patient_status_at_end_study"),
  mincor = 0.2,
  minpuc = 0.5
)
# Tweak the predictor matrix: zero out the columns of uninformative variables
# so that they are not used as predictors for imputation
predM[, c(
  "id",
  # "gender", # Do not remove if you want post-imputation to work correctly
  "albumin",
  "inr",
  "aptt",
  "aptt_ratio",
  "oral_contraceptive_tx",
  "grade_histological_differentiation",
  "t_stage",
  "metastasis_dx",
  "n_metastases"
)] <- 0
# Distribution of the number of predictors per variable
table(rowSums(predM))
## 
##  0  2  3  4  5  6  7  8  9 10 11 
## 18  4  3  5  2  1  3  3  1  2  1
# The names of the predictors for any given variable can be obtained as follows
# (here for copd: the columns flagged with 1 in its row of the predictor matrix)
names(data_for_imputation)[predM["copd", ] == 1]
## [1] "patient_group"               "patient_status_at_end_study"
## [3] "tobacco_use"

This means that the predictors for imputing missing values in copd will be patient_group, patient_status_at_end_study and tobacco_use.

# Visit sequence: the order in which the MICE algorithm goes through the
# variables (taken from the dry run)
visit <- init$visitSequence
print(visit)
##  [1] "id"                                     
##  [2] "patient_group"                          
##  [3] "patient_status_at_end_study"            
##  [4] "age_when_cancer_dx"                     
##  [5] "gender"                                 
##  [6] "menopausal_status"                      
##  [7] "pregnancy"                              
##  [8] "oral_contraceptive_tx"                  
##  [9] "weight"                                 
## [10] "height"                                 
## [11] "body_surface_area"                      
## [12] "bmi_value"                              
## [13] "bmi_category"                           
## [14] "performance_status_category_corrected"  
## [15] "albumin"                                
## [16] "aptt"                                   
## [17] "aptt_ratio"                             
## [18] "bilirubin"                              
## [19] "creatinine"                             
## [20] "alkaline_phosphatase"                   
## [21] "hemoglobin"                             
## [22] "inr"                                    
## [23] "leukocytes"                             
## [24] "platelets"                              
## [25] "leukocytosis"                           
## [26] "thrombocytosis"                         
## [27] "low_hemoglobin"                         
## [28] "tobacco_use"                            
## [29] "copd"                                   
## [30] "venous_insufficiency"                   
## [31] "primary_tumor_simplified"               
## [32] "progression_according_to_clinical_stage"
## [33] "tnm_stage"                              
## [34] "t_stage"                                
## [35] "n_stage"                                
## [36] "histology_type"                         
## [37] "mucinous_histology"                     
## [38] "grade_histological_differentiation"     
## [39] "metastasis_dx"                          
## [40] "n_metastases"                           
## [41] "catheter_device"                        
## [42] "khorana_risk_score"                     
## [43] "tic_onco"
# Post-imputation
# Start from the post-processing vector of the dry run. Each entry set below is
# a string of R code that overwrites part of the imputed values of one variable
# (imp[[j]], imputation i), based on the values of another variable in `data`.
post <- init$post

# Gender-specific variables
# If gender is "Male", use value "Male"
# NOTE(review): presumably "Male" is an existing level of these factors;
# assigning a value that is not a level would yield NA -- confirm.
# NOTE(review): oral_contraceptive_tx has no imputation method in the printed
# method vector, so its rule presumably never runs -- confirm.
post[c("menopausal_status", "pregnancy", "oral_contraceptive_tx")] <-
  "imp[[j]][data$gender[!r[, j]] == 'Male', i] <- 'Male'"

# Leukocytes' levels
# Imputed leukocytes of patients with leukocytosis are set to 11000
# NOTE(review): this also replaces imputed values that are already above 11000,
# and 11000 itself does not satisfy the "> 11000" rule below -- confirm that
# this is intended, and confirm the unit of the threshold.
post["leukocytes"] <-
  "imp[[j]][data$leukocytosis[!r[, j]] == 'Yes', i] <- 11000"
# Imputed leukocytosis is set to 'Yes' for patients with leukocytes > 11000
post["leukocytosis"] <-
  "imp[[j]][data$leukocytes[!r[, j]] > 11000, i] <- 'Yes'"
# Run the imputation with the method vector, predictor matrix, visit sequence
# and post-processing rules prepared above
imputed <- mice(
  data = data_for_imputation,
  m = 10, # Generate 10 imputed datasets
  method = meth,
  predictorMatrix = predM,
  visitSequence = visit,
  post = post, # Apply post-processing changes
  seed = 2828
)
## 
##  iter imp variable
##   1   1  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   1   2  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   1   3  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   1   4  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   1   5  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   1   6  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   1   7  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   1   8  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   1   9  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   1   10  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   2   1  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   2   2  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   2   3  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   2   4  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   2   5  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   2   6  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   2   7  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   2   8  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   2   9  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   2   10  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   3   1  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   3   2  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   3   3  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   3   4  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   3   5  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   3   6  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   3   7  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   3   8  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   3   9  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   3   10  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   4   1  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   4   2  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   4   3  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   4   4  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   4   5  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   4   6  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   4   7  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   4   8  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   4   9  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   4   10  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   5   1  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   5   2  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   5   3  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   5   4  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   5   5  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   5   6  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   5   7  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   5   8  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   5   9  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
##   5   10  menopausal_status  pregnancy  performance_status_category_corrected  albumin  aptt  aptt_ratio  bilirubin  creatinine  alkaline_phosphatase  inr  leukocytes  leukocytosis  tobacco_use  copd  venous_insufficiency  progression_according_to_clinical_stage  t_stage  n_stage  histology_type  mucinous_histology  grade_histological_differentiation  catheter_device  khorana_risk_score  tic_onco
## Warning: Number of logged events: 151

Inspect data resulting from imputation

# Convergence diagnostics: plot the imputation process through the iterations
# for every variable that was imputed (i.e. every variable listed in the mice
# iteration log).
# oral_contraceptive_tx is deliberately left out: no imputation is done on
# this variable, so plot() cannot find it.
plot(imputed, c(
  "menopausal_status",
  "performance_status_category_corrected",
  "albumin",
  "aptt",
  "aptt_ratio",
  "bilirubin",
  "creatinine",
  "alkaline_phosphatase",
  "inr",
  "leukocytes",
  "leukocytosis",
  "pregnancy",
  # "oral_contraceptive_tx", # not found because no imputation is done on this variable
  "tobacco_use",
  "copd", # imputed as well, so its convergence has to be checked too
  "venous_insufficiency",
  "progression_according_to_clinical_stage",
  "t_stage",
  "n_stage",
  "histology_type",
  "mucinous_histology",
  "grade_histological_differentiation",
  "catheter_device",
  "khorana_risk_score",
  "tic_onco"
  ))

# Extract one completed dataset (observed + imputed values) from the mids
# object; action = 1L is the default and returns the first imputed dataset
whole_imputed_data = mice::complete(imputed, action = 1L)
# Sanity check: tabulate the missing data patterns left after imputation.
# Ideally every column should report 0 missing values
md.pattern(x = whole_imputed_data, rotate.names = TRUE)

##     id patient_group patient_status_at_end_study age_when_cancer_dx gender
## 389  1             1                           1                  1      1
## 1    1             1                           1                  1      1
##      0             0                           0                  0      0
##     menopausal_status pregnancy weight height body_surface_area bmi_value
## 389                 1         1      1      1                 1         1
## 1                   1         1      1      1                 1         1
##                     0         0      0      0                 0         0
##     bmi_category performance_status_category_corrected albumin aptt aptt_ratio
## 389            1                                     1       1    1          1
## 1              1                                     1       1    1          1
##                0                                     0       0    0          0
##     bilirubin creatinine alkaline_phosphatase hemoglobin inr leukocytes
## 389         1          1                    1          1   1          1
## 1           1          1                    1          1   1          1
##             0          0                    0          0   0          0
##     platelets leukocytosis thrombocytosis low_hemoglobin tobacco_use copd
## 389         1            1              1              1           1    1
## 1           1            1              1              1           1    1
##             0            0              0              0           0    0
##     venous_insufficiency primary_tumor_simplified
## 389                    1                        1
## 1                      1                        1
##                        0                        0
##     progression_according_to_clinical_stage tnm_stage t_stage n_stage
## 389                                       1         1       1       1
## 1                                         1         1       1       1
##                                           0         0       0       0
##     histology_type mucinous_histology grade_histological_differentiation
## 389              1                  1                                  1
## 1                1                  1                                  1
##                  0                  0                                  0
##     metastasis_dx n_metastases catheter_device khorana_risk_score tic_onco
## 389             1            1               1                  1        1
## 1               1            1               1                  1        1
##                 0            0               0                  0        0
##     oral_contraceptive_tx  
## 389                     1 0
## 1                       0 1
##                         1 1

One row still has a missing value in oral_contraceptive_tx. This is probably because the variable is constant (no patient took oral contraceptives during the study), so no imputation method was assigned to it.

# Density plots of the numerical variables, overlaying the distribution of
# the observed data with the distributions of the imputed data
densityplot(x = imputed)

The density curves of the imputed values (one curve per imputed dataset) are, overall, quite similar to those of the observed data, although there is some variability. This is a good sign.

Review all imputation results

# Stack the completed datasets of all imputations into a single data frame
# in long format, so every imputed value can be reviewed in one place
imputation_data_long = complete(imputed, action = "long")

Save imputed variables in .RData object

# Keep the patient id plus the variables to export, and tag each exported
# variable with the "_imp" suffix so it can be told apart from the original
# column.
# NOTE(review): pregnancy appears in the imputation log but is not exported
# here, while metastasis_dx is exported although it does not appear in the
# imputation log -- confirm that both choices are intended.
whole_imputed_data = whole_imputed_data %>%
  select(
    id,
    menopausal_status,
    performance_status_category_corrected,
    albumin,
    aptt,
    aptt_ratio,
    bilirubin,
    creatinine,
    alkaline_phosphatase,
    inr,
    leukocytes,
    leukocytosis,
    tobacco_use,
    copd,
    venous_insufficiency,
    progression_according_to_clinical_stage,
    t_stage,
    n_stage,
    histology_type,
    mucinous_histology,
    grade_histological_differentiation,
    metastasis_dx,
    catheter_device,
    khorana_risk_score,
    tic_onco
  ) %>%
  # Every column except the identifier gets the suffix
  rename_with(~ paste0(.x, "_imp"), .cols = !id)
# Save results in RData format
# save(whole_imputed_data, file = "oncoth1_whole_imputed_data.RData")